library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(patchwork)
library(p8105.datasets)
data("weather_df")

As a starting point, let’s revisit the scatterplot of tmax against tmin made in Visualization Pt 1.

# Baseline scatterplot of daily maximum against daily minimum temperature.
# Color is mapped inside geom_point(), so it applies to the point layer only.
ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax)) +
  geom_point(mapping = aes(color = name), alpha = 0.5)
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

Labels

# Same scatterplot with readable text: labs() sets the title, both axis
# titles, the legend title (via the `color` aesthetic) and the caption.
weather_df |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package"
  )
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

Scales

# Labelled scatterplot with hand-picked x-axis tick marks.
# `breaks` chooses where the ticks go; `labels` supplies the text shown at
# each tick, one label per break.
weather_df |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package") + 
  scale_x_continuous(
    breaks = c(-15, 0, 15), 
    labels = c("-15° C", "0", "15"))
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Another way to do limits:

# Limit what is shown by filtering the data instead of the scale:
# only rows with 10 < tmax < 30 reach ggplot(). filter() also drops rows
# where the condition is NA, i.e. rows with a missing tmax.
weather_df |> 
  filter(tmax > 10, tmax < 30) |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package") + 
  scale_x_continuous(
    breaks = c(-15, 0, 15), 
    labels = c("-15° C", "0", "15"))

######### Limits
# Limit the plot through the scales themselves.
# - `limits` fixes the x-axis range to [-20, 30].
# - The y axis is square-root transformed and drawn on the right-hand side.
#   Negative tmax values have no real square root, so those points are
#   removed with a warning.
# `transform` replaces the `trans` argument, which is deprecated as of
# ggplot2 3.5.0.
weather_df |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package") + 
  scale_x_continuous(
    breaks = c(-15, 0, 15), 
    labels = c("-15°C", "0", "15"),
    limits = c(-20, 30)) + 
  scale_y_continuous(
    transform = "sqrt", 
    position = "right")
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_continuous(trans = "sqrt", position = "right"): sqrt
## transformation introduced infinite values.
## Warning: Removed 142 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Changing the colors manually using scale_color_hue

# Labelled scatterplot with a manually chosen discrete color palette:
# scale_color_hue(h = c(100, 300)) restricts the hue range used for `name`.
weather_df |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package") + 
  scale_x_continuous(
    breaks = c(-15, 0, 15), 
    labels = c("-15° C", "0", "15")) + 
  scale_color_hue(h = c(100, 300))
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

#  scale_color_hue(h=c(100,300))



#Better way - Jeff suggests always using viridis
# Build the labelled scatterplot once and keep it in `ggp_temperature`, so
# later chunks can add themes to it without repeating this code.
# The discrete viridis palette colors the points by station; `name` here is
# the legend title.
ggp_temperature = 
  weather_df |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package") + 
  scale_x_continuous(
    breaks = c(-15, 0, 15), 
    labels = c("-15° C", "0", "15")) + 
  viridis::scale_color_viridis(
    name = "Location", 
    discrete = TRUE
  )

# Write the stored plot to disk; passing it explicitly avoids depending on
# whichever plot happened to be displayed last.
ggsave("weather_scatterplot.png", plot = ggp_temperature)
## Saving 7 x 5 in image
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
#Saving the plot as ggp_temperature means you don't have to keep adding to this code chunk specifically.
#Use this structure (name = ...) if you want to save specific graphics or if your code chunk is getting large.

Update my base plot

# Jeff's classic and go to 
# Minimal theme, legend underneath. theme(legend.position = ...) is added
# after theme_minimal() so the complete theme does not overwrite it.
ggp_temperature + theme_minimal() + theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Another possibility 
# Dark theme, legend underneath (legend tweak added after the complete theme).
ggp_temperature + theme_dark() + theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

# And another! 
# Classic theme, legend underneath (legend tweak added after the complete theme).
ggp_temperature + theme_classic() + theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

#they have packages where you can make it look like excel 2003 or wes anderson movies.

This is big!

Add this code at the beginning of every R Markdown (.Rmd) file

Setting Options

Jeff puts this in every file to automatically set how he wants his figures to look

library(tidyverse)

# Figure geometry applied to every knitted chunk: width, aspect ratio
# (height / width) and the share of the page width the output takes up.
knitr::opts_chunk$set(fig.width = 6, fig.asp = 0.6, out.width = "90%")

# Session-wide ggplot2 theme: minimal, with the legend below the plot.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Continuous colour and fill scales default to viridis.
options(
  ggplot2.continuous.fill = "viridis",
  ggplot2.continuous.colour = "viridis"
)

# Discrete colour and fill scales: shadow ggplot2's default scale functions
# with their viridis counterparts.
scale_fill_discrete = scale_fill_viridis_d
scale_colour_discrete = scale_colour_viridis_d

Adding data in geoms

# One data frame per station, so each can feed its own layer.
molokai_df = filter(weather_df, name == "Molokai_HI")
central_park_df = filter(weather_df, name == "CentralPark_NY")

# Points use the plot-level data (Molokai). The line layer overrides `data`
# with Central Park while inheriting the same aesthetic mappings.
molokai_df |>
  ggplot(aes(x = date, y = tmax, color = name)) +
  geom_point() +
  geom_line(data = central_park_df)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

patchwork

Make three plots separately and combine them using patchwork

# Panel 1: tmax against tmin, colored by station; legend hidden.
ggp_tmax_tmin =
  ggplot(weather_df, aes(x = tmin, y = tmax, colour = name)) +
  geom_point(alpha = 0.5) +
  theme(legend.position = "none")

# Panel 2: precipitation density by station; legend hidden.
# Days with no precipitation are filtered out first, because the large
# spike at 0 is not of interest here.
ggp_prcp_density =
  ggplot(filter(weather_df, prcp > 0), aes(x = prcp, fill = name)) +
  geom_density(alpha = 0.5) +
  theme(legend.position = "none")

# Panel 3: tmax over time with a smooth trend per station (no confidence
# band), which shows seasonality; legend placed at the bottom.
ggp_temp_season =
  ggplot(weather_df, aes(x = date, y = tmax, color = name)) +
  geom_point(alpha = 0.5) +
  geom_smooth(se = FALSE) +
  theme(legend.position = "bottom")

# Basic patchwork syntax: plot + plot places the two panels next to each other.
ggp_tmax_tmin + ggp_prcp_density
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

# `/` stacks: the two-panel row goes on top, the seasonal plot underneath.
(ggp_tmax_tmin + ggp_prcp_density) /
  ggp_temp_season
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

Data Manipulation

Let’s make temperature violin plots.

# Violin plot of tmax for each station, filled by station.
ggplot(data = weather_df, mapping = aes(x = name, y = tmax, fill = name)) +
  geom_violin(alpha = .5)
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

# factor variables - 

ggplot orders string values alphabetically (assigning them numbers in that order). To change this order, use the following pattern: mutate([] = fct_relevel([], ...))

# Put the stations in an explicit, hand-chosen order before plotting:
# fct_relevel() moves the listed levels to the front, in the order given.
weather_df |>
  mutate(
    name = fct_relevel(name, "Molokai_HI", "CentralPark_NY", "Waterhole_WA")
  ) |>
  ggplot(mapping = aes(x = name, y = tmax, fill = name)) +
  geom_violin(alpha = .5)
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

#fct_relevel says: take your name variable and put its values in the following order (left to right = 1st to 3rd); mutate() then makes this the new [name] variable



# Another way that Jeff does it (and more frequently)
# Order the stations by their tmax values instead of listing levels by hand
# (fct_reorder() sorts levels by the median of the second argument by default).
# tmax contains missing values; `.na_rm = TRUE` drops them explicitly when
# computing the ordering, which gives the same order as the default but
# without the "removing missing values" warning.
weather_df |> 
  mutate(name = fct_reorder(name, tmax, .na_rm = TRUE)) |> 
  ggplot(aes(x = name, y = tmax, fill = name)) +
  geom_violin(alpha = 0.5)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `name = fct_reorder(name, tmax)`.
## Caused by warning:
## ! `fct_reorder()` removing 17 missing values.
## ℹ Use `.na_rm = TRUE` to silence this message.
## ℹ Use `.na_rm = FALSE` to preserve NAs.
## Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

#fct_reorder is especially helpful if you have a ton of categories
#here, organize [name] by ascending tmax value

#Tidy the data before ggplotting. What about data tidiness?

Suppose I want to ask: does the distribution of the BDI index change across visits? To do this we need a box plot

BUT – right now, our data are not structured this way

# Read the PULSE SAS file, normalise the column names, and reshape the BDI
# score columns (bdi_score_bl through bdi_score_12m) into long format with
# one row per visit.
# The "bl" visit is renamed "00m"; fct_inorder() then fixes the level order
# to the order in which the visit values first appear in the data.
pulse_df =
  haven::read_sas("data/public_pulse_data.sas7bdat") |>
  janitor::clean_names() |>
  pivot_longer(
    cols = bdi_score_bl:bdi_score_12m,
    names_to = "visit",
    names_prefix = "bdi_score_",
    values_to = "bdi"
  ) |>
  mutate(visit = replace(visit, visit == "bl", "00m")) |>
  mutate(visit = fct_inorder(visit))


#need to include haven:: because we didn't use library(haven) at the beginning
# fct_inorder() saying order based on how it shows up in the dataset

In class exercise

How would I make the following graphic (look at the screen):

Patchwork-like layout of two different litter groups: 7 on top and 8 on bottom; dose (con, low, mod) on the x axis; pn_day on the y axis (the day on which this happened, postnatal days for…)

Looking at four different things: ears, pivot, eyes, walk

faceting for these two: the groups (days of treatment) + outcome

Make a plot for the FAS study

# Pups data: skip the first three lines of the file, read "NA", "." and
# empty strings as missing, normalise the column names, and recode the
# numeric sex codes (1 / 2) as "male" / "female".
pups_df =
  janitor::clean_names(
    read_csv("data/FAS_pups.csv", na = c("NA", ".", ""), skip = 3)
  ) |>
  mutate(sex = case_match(sex, 1 ~ "male", 2 ~ "female"))
## Rows: 313 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Litter Number
## dbl (5): Sex, PD ears, PD eyes, PD pivot, PD walk
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Litters data: read with the same missing-value markers, normalise the
# column names, then split `group` after its third character into
# `dose` (first three characters) and `tx_day` (the remainder).
litters_df =
  janitor::clean_names(
    read_csv("data/FAS_litters.csv", na = c("NA", ".", ""))
  ) |>
  separate(col = group, into = c("dose", "tx_day"), sep = 3)
## Rows: 49 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Group, Litter Number
## dbl (6): GD0 weight, GD18 weight, GD of Birth, Pups born alive, Pups dead @ ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach litter-level columns to every pup; all pups are kept (left join),
# matched on litter_number.
fas_df =
  pups_df |>
  left_join(litters_df, by = "litter_number")

# Distribution of postnatal day by dose, one panel per treatment day (rows)
# and developmental outcome (columns).
# - select(pd_ears:tx_day) keeps the outcome columns plus dose and tx_day;
#   it relies on the column order of fas_df.
# - pivot_longer() stacks the four pd_* columns into outcome / pn_day pairs.
# - fct_reorder() orders the outcomes by pn_day. It runs before drop_na(),
#   so pn_day still contains NAs at that point; `.na_rm = TRUE` removes them
#   explicitly when computing the order (same result as the default, but
#   without the warning).
# - drop_na() then removes rows with a missing value in any remaining column.
fas_df |> 
  select(pd_ears:tx_day) |> 
  pivot_longer(
    pd_ears:pd_walk,
    names_to = "outcome",
    names_prefix = "pd_",
    values_to = "pn_day"
  ) |> 
  mutate(outcome = fct_reorder(outcome, pn_day, .na_rm = TRUE)) |> 
  drop_na() |> 
  ggplot(aes(x = dose, y = pn_day)) +
  geom_violin() +
  facet_grid(tx_day ~ outcome)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `outcome = fct_reorder(outcome, pn_day)`.
## Caused by warning:
## ! `fct_reorder()` removing 44 missing values.
## ℹ Use `.na_rm = TRUE` to silence this message.
## ℹ Use `.na_rm = FALSE` to preserve NAs.

#Note that the ggplot code is only three lines long. Everything else is data manipulation!